import numpy as np
import matplotlib.pyplot as plt
import scipy as sp
import pandas as pd
from typing import List
from numpy.linalg import norm
from IPython.core.formatters import Dict
from scipy.optimize import minimize
import matplotlib.pyplot as plt
from sklearn.datasets import make_moons
import numpy as np
from matplotlib.patches import Ellipse
import pickle
# Generate the 2D dataset using sklearn
# Create the moon datasets of varying sizes
n_samples_small = 100
n_samples_medium = 500
n_samples_large = 1000
# X, _ = make_moons(n_samples=500, noise=0.05, random_state=0)
Xsmall, _ = make_moons(n_samples=n_samples_small, noise=0.05, random_state=0)
Xmid, _ = make_moons(n_samples=n_samples_medium, noise=0.05, random_state=0)
Xlarge, _ = make_moons(n_samples=n_samples_large, noise=0.05, random_state=0)
# BUG FIX: the original had a duplicated chained assignment
# (`datasets=datasets = {...}`); a single assignment suffices.
datasets = {
    "Small": Xsmall,
    "Medium": Xmid,
    "Large": Xlarge,
}
# Plot the Medium dataset
# (the stray REPL output line `<matplotlib.collections.PathCollection ...>`
# that followed here was notebook output residue and has been removed —
# it is a syntax error in a .py file.)
plt.figure(figsize=(8, 6))
plt.scatter(datasets["Medium"][:, 0], datasets["Medium"][:, 1], c='blue', label='make_moons dataset', alpha=0.5)
def train_histogram(X, bins=20):
    """Fit a 2-D histogram density model to the data.

    Args:
        X: array of shape (n_samples, 2).
        bins: number of equal-width bins per dimension.

    Returns:
        (histogram, x_bins, y_bins): integer count matrix of shape
        (bins, bins) indexed as [x_bin, y_bin], plus the bin-edge arrays
        (each of length bins + 1) spanning the data range.
    """
    x_bins = np.linspace(X[:, 0].min(), X[:, 0].max(), bins + 1)
    y_bins = np.linspace(X[:, 1].min(), X[:, 1].max(), bins + 1)
    # BUG FIX: the original manual loop used strictly half-open intervals
    # [edge_i, edge_{i+1}) for every bin, so the points lying exactly on the
    # data maximum were never counted. np.histogram2d counts at C speed and
    # includes the final right edge in the last bin.
    histogram, _, _ = np.histogram2d(X[:, 0], X[:, 1], bins=[x_bins, y_bins])
    return histogram.astype(int), x_bins, y_bins
# Train the histogram model on one dataset.
# NOTE(review): despite its name, X_small is bound to the *Large* dataset
# (1000 points) — the author acknowledges this misnomer later in the text.
# The name is kept because subsequent cells reference it.
X_small = datasets["Large"]
histogram_small, x_bins, y_bins = train_histogram(X_small)
# Visualize the learned 2-D histogram as a heat map.
plt.imshow(histogram_small, origin='lower', cmap='Blues')
plt.colorbar()
plt.title('2D Histogram')
plt.show()
def generate_samples_from_histogram(histogram, x_bins, y_bins, num_samples):
    """Draw samples from the empirical distribution encoded by a 2-D histogram.

    Args:
        histogram: count matrix of shape (num_x_bins, num_y_bins).
        x_bins: bin-edge array of length num_x_bins + 1.
        y_bins: bin-edge array of length num_y_bins + 1.
        num_samples: number of points to draw.

    Returns:
        Array of shape (num_samples, 2); every sample lies at a bin centre.
    """
    # Normalize the counts so they form a probability distribution over bins.
    probabilities = histogram / np.sum(histogram)
    # Draw flat bin indices weighted by bin probability, then convert back to
    # (row, col) pairs. (The original also computed x_min/x_max/y_min/y_max
    # here, but never used them — removed.)
    flat_indices = np.random.choice(histogram.size, size=num_samples, p=probabilities.ravel())
    x_idx, y_idx = np.unravel_index(flat_indices, histogram.shape)
    # Map bin indices to the midpoints of the corresponding bins.
    x_bin_centers = (x_bins[:-1] + x_bins[1:]) / 2
    y_bin_centers = (y_bins[:-1] + y_bins[1:]) / 2
    return np.column_stack((x_bin_centers[x_idx], y_bin_centers[y_idx]))
# Example of generating data samples from the trained histogram
# (note: histogram_small was actually trained on the Large dataset above).
num_samples_to_generate = 1000
generated_samples = generate_samples_from_histogram(histogram_small, x_bins, y_bins, num_samples_to_generate)
plt.scatter(generated_samples[:, 0], generated_samples[:, 1], c='red', marker='x', label='Generated Samples')
# Overlay the original data points for comparison
plt.scatter(X_small[:, 0], X_small[:, 1], c='blue', marker='o', label='Original Data')
# Set labels and legend
plt.xlabel('X-axis')
plt.ylabel('Y-axis')
plt.legend()
# Show the plot
plt.show()
Conceptual difference in implementation:
1.1. The sample solution have implemented a grid-based approach. It defines bin edges along both dimensions (x and y) and counts data points falling into each bin. The histogram is constructed based on the occurrences within each bin. 1.2. In our solution, we have implemented a range-based histogram. It counts occurrences by checking each data point against bin edges.It uses conditions to determine the appropriate bin for each data point.
The grid-based approach provides a more regular and structured representation and is suitable for capturing overall trends, whereas the range-based approach allows capturing the specific characteristics of the data distribution more thoroughly.
In the figure above, we have considered a dataset with 1000 data points. (We would like to point out that there is a mistake in naming the variable above: we named our dataset X_small even though we considered the large dataset with 1000 data points. We apologise for this mistake.) Our results agree well with the sample solution's plot.
### define n-dimensional gaussian distribution
def gaussian_multivar(x: np.array, mean: np.array, cov_mat: np.array):
    '''
    Evaluate the multivariate Gaussian density N(mean, cov_mat) at x.

    Args:
        1. x: np array of shape (1 X dimn_of_the_problem) (a row vector)
        2. mean: np array of shape (1 X dimn_of_the_problem)
        3. cov_mat: square np array of shape (dimn_of_the_problem X dimn_of_the_problem)

    Returns:
        The probability density (a float) at x.
    '''
    # BUG FIX: for the documented (1, d) row vector, np.shape(x)[0] is always
    # 1, so the original normalization used (2*pi)^1 regardless of the true
    # dimensionality. The dimensionality is the total number of entries.
    dimn = np.size(x)
    inv_cov = np.linalg.inv(cov_mat)
    x_minus_mu = x - mean
    # Quadratic form (x - mu) Sigma^{-1} (x - mu)^T; [0, 0] extracts the scalar.
    arg_for_expn = 0.5 * (x_minus_mu @ inv_cov @ np.transpose(x_minus_mu))[0, 0]
    det_cov = np.linalg.det(cov_mat)
    # Density: exp(-q) / sqrt(|Sigma| * (2*pi)^d)
    prob = np.exp(-arg_for_expn) * (1. / (np.sqrt(det_cov * ((2 * np.pi) ** dimn))))
    return prob
import numpy as np
from scipy.linalg import cholesky
from scipy.special import erfinv
def sample_multivariate_gaussian(mean, covariance, num_samples=1):
    """
    Sample from an n-dimensional Gaussian distribution using inverse-transform
    sampling followed by a Cholesky factorization of the covariance.

    Args:
        mean: Mean vector (1D array) of length n.
        covariance: Covariance matrix (2D array) of shape (n, n).
        num_samples: Number of samples to generate (default is 1).

    Returns:
        samples: Array of shape (num_samples, n) containing generated samples.
    """
    n = len(mean)
    # Uniform draws in [0, 1).
    u = np.random.rand(num_samples, n)
    # BUG FIX: np.random.rand can return exactly 0.0, for which
    # erfinv(2*0 - 1) = erfinv(-1) = -inf. Clip u strictly inside (0, 1)
    # so the probit transform below always yields finite values.
    u = np.clip(u, np.finfo(float).tiny, 1.0 - np.finfo(float).epsneg)
    # Probit transform: map uniforms to standard normal samples.
    z_standard_normal = np.sqrt(2) * erfinv(2 * u - 1)
    # Cholesky factor L with covariance = L @ L.T.
    L = cholesky(covariance, lower=True)
    # Correlate the standard-normal draws and shift by the mean.
    samples = np.dot(z_standard_normal, L.T) + mean
    return samples
# Example usage:
# mean_vector = np.array([0.0, 0.0])
# covariance_matrix = np.array([[1.0, 0.5], [0.5, 1.0]])
# generated_samples = sample_multivariate_gaussian(mean_vector, covariance_matrix, num_samples=100)
In our case, the best-fitting parameters are the mean vector and the covariance matrix of the given dataset.
def fit_gaussian_to_data(X):
    """Return the maximum-likelihood Gaussian parameters (mean, covariance) of X."""
    mu = X.mean(axis=0)
    sigma = np.cov(X, rowvar=False)
    return (mu, sigma)
# Fit one Gaussian per dataset size, then sample from each fitted Gaussian.
fitted_params = {}
fitted_params['Small'] = fit_gaussian_to_data(datasets["Small"])
fitted_params['Medium'] = fit_gaussian_to_data(datasets["Medium"])
fitted_params['Large'] = fit_gaussian_to_data(datasets["Large"])
# BUG FIX: the original drew all three sample sets from the 'Small' fit,
# contradicting the plot titles below; each sample set now uses the
# parameters fitted to its own dataset.
sample_small = sample_multivariate_gaussian(fitted_params['Small'][0], fitted_params['Small'][1], num_samples=100)
sample_medium = sample_multivariate_gaussian(fitted_params['Medium'][0], fitted_params['Medium'][1], num_samples=100)
sample_high = sample_multivariate_gaussian(fitted_params['Large'][0], fitted_params['Large'][1], num_samples=100)
# Plot each dataset against the samples drawn from its fitted Gaussian.
plt.figure(figsize=(8, 6))
plt.scatter(datasets['Small'][:, 0], datasets['Small'][:, 1], c='blue', label='make_moons dataset: small', alpha=0.5)
plt.scatter(sample_small[:, 0], sample_small[:, 1], c='red', label="sampled", alpha=0.5)
plt.title("gaussian fitted to 100 datapoints")
plt.legend()
plt.show()
plt.figure(figsize=(8, 6))
plt.scatter(datasets['Medium'][:, 0], datasets['Medium'][:, 1], c='blue', label='make_moons dataset: medium', alpha=0.5)
plt.scatter(sample_medium[:, 0], sample_medium[:, 1], c='red', label="sampled", alpha=0.5)
plt.title("gaussian fitted to 500 datapoints")
plt.legend()
plt.show()
plt.figure(figsize=(8, 6))
plt.scatter(datasets['Large'][:, 0], datasets['Large'][:, 1], c='blue', label='make_moons dataset: large', alpha=0.5)
plt.scatter(sample_high[:, 0], sample_high[:, 1], c='red', label="sampled", alpha=0.5)
plt.title("gaussian fitted to 1000 datapoints")
plt.legend()
plt.show()
Both the codes (sample and ours) achieve the task of defining a Gaussian distribution class, fitting it to data, and generating samples. However, the most important difference between the two is the way samples are generated:
sample code: It uses eigenvalue decomposition to transform samples from a standard normal distribution to match the covariance matrix of the Gaussian distribution. It follows a more complex but mathematically sound method for generating samples that respect the covariance structure.
Our code: It uses the Cholesky decomposition method for generating samples from a multivariate Gaussian distribution. Cholesky decomposition is a standard and efficient method for generating correlated multivariate normal samples.
If simplicity and widely accepted methods are a priority, our code might be favored. If a more advanced method with eigenvalue decomposition is preferred, the sample solution is a good choice.
As far as code structure is concerned, we have followed a more modular approach in our implementation (unlike the sample solution, which is more suitable for use in scikit-learn pipelines).
def gaussian_mixture_model(x_input: np.array, mixing_coeffs: np.array, mean_array: np.array, covs: dict):
    '''
    Evaluate the GMM density p(x) = sum_l pi_l * N(x; mu_l, Sigma_l) at x_input.

    Args:
        1. x_input: x vector at which one wants to evaluate the GMM model.
           A numpy array (row vector) of shape (1 X dimn of the problem)
        2. mixing_coeffs: a numpy array of shape (1 X L)
        3. mean_array: a numpy array of shape (L X dimn of the problem instance)
        4. covs: dict of covariance matrices, L entries, one for each component P_l(x).
    '''
    # Number of mixture components.
    L=np.shape(mean_array)[0]
    dimn_of_the_prob=np.shape(x_input)[1]
    # Weighted component densities pi_l * N(x; mu_l, Sigma_l).
    list_gmm=[mixing_coeffs[0,idx_el] * gaussian_multivar(x=x_input, mean=mean_array[idx_el,:],cov_mat=covs[idx_el])
              for idx_el in range(0,L)
              ]
    prob=sum(list_gmm)  # total mixture density at x_input
    return prob
def EM_training(mean: np.array, mixing_coeffs: np.array, covs: dict, dataset_input: np.array, num_iters: int = 500, tol: float = 0.0001):
    '''
    Fit a Gaussian mixture model by Expectation-Maximization.

    Args:
        mean: numpy array of shape
            (L X dimension_of_the_problem_instance (whether 1D, 2D, 3D and so on))
        mixing_coeffs: numpy array of shape (1 X L)
        covs: dict of covariance matrices, L entries, one for each component P_l(x)
        dataset_input: array of shape (num of data set X dimn_of_the_prob)
        num_iters: maximum number of EM iterations
        tol: stop once the log-likelihood improves by less than this amount

    Returns:
        (mean, covs, mixing_coeffs).
        NOTE: the input arrays/dict are updated *in place*; the same objects
        passed in are returned.
    '''
    L=np.shape(mean)[0]
    print("L is:");print(L)
    dimn_of_prob=np.shape(dataset_input)[1]
    # gamma[i, l] is the responsibility of component l for data point i.
    num_data=np.shape(dataset_input)[0]  # shape(dataset_input) = (num_data, dimn_of_prob)
    gamma=np.zeros(shape=(num_data,L))
    prev_log_likelihood = -np.inf  # Initialize the log-likelihood to negative infinity
    for t in range(0,num_iters):
        ### E-step: update the responsibilities gamma.
        for idx_over_data in range(0,num_data):
            # pi_l * N(x_i; mu_l, Sigma_l) for every component l.
            prod_mixing_coeffs_and_gaussian_l=[mixing_coeffs[0,idx_el]*gaussian_multivar(x=dataset_input[idx_over_data,:].reshape((1,dimn_of_prob)),mean=mean[idx_el,:].reshape((1,dimn_of_prob)),cov_mat=covs[idx_el]) for idx_el in range(0,L)]
            # Normalizing denominator: total mixture density at x_i.
            Dr_for_gamma=np.sum(prod_mixing_coeffs_and_gaussian_l)
            gamma[idx_over_data,:]=[prod_mixing_coeffs_and_gaussian_l[k]/Dr_for_gamma for k in range(0,L)]
        ### M-step: re-estimate the parameters.
        # N_l = sum_{i=1}^{N} gamma_{il} (effective count per component); shape (L,).
        N_l=np.sum(gamma,axis=0)
        ### 1. update means: responsibility-weighted average of the data points.
        for idx_el in range(0,L):
            temp_vec=np.array(
                [gamma[idx_over_data,idx_el]*dataset_input[idx_over_data,:]
                 for idx_over_data in range(0,num_data)]
            )
            mean[idx_el,:]=(1./N_l[idx_el])*np.sum(temp_vec,axis=0)
        ### 2. update the covariance matrices:
        ##### sigma_l = (1/N_l) * sum_{i=1}^{N} gamma_{il} * (x_i - mu_l)(x_i - mu_l)^T
        for idx_el in range(0,L):
            temp_outer_prod_list=np.array(
                [gamma[idx_over_data,idx_el]*(dataset_input[idx_over_data,:]-mean[idx_el,:]).reshape((dimn_of_prob,1)) @
                 (dataset_input[idx_over_data,:]-mean[idx_el,:]).reshape((1,dimn_of_prob))
                 for idx_over_data in range(0,num_data)
                 ]
            )
            covs[idx_el]=(1./N_l[idx_el])*sum(temp_outer_prod_list)
        ### 3. update the mixing coefficients: pi_l = N_l / N.
        for idx_el in range(0,L):
            mixing_coeffs[0,idx_el]=N_l[idx_el]/num_data
        # Log-likelihood of the data under the current parameters.
        current_log_likelihood = 0
        for idx_over_data in range(num_data):
            log_sum = np.log(sum([mixing_coeffs[0, idx_el] * gaussian_multivar(x=dataset_input[idx_over_data, :].reshape((1, dimn_of_prob)), mean=mean[idx_el, :].reshape((1, dimn_of_prob)), cov_mat=covs[idx_el]) for idx_el in range(L)]))
            current_log_likelihood += log_sum
        # Convergence check: stop once the improvement falls below tol.
        if np.abs(current_log_likelihood - prev_log_likelihood) < tol:
            break
        prev_log_likelihood = current_log_likelihood
    return mean,covs,mixing_coeffs
def fit_the_GMM(X, L):
    """Fit an L-component GMM to X with EM, plot the data with the learned
    density, and return the trained parameters.

    Args:
        X: dataset of shape (N, d) (d = 2 for the plots below).
        L: number of mixture components.

    Returns:
        (trained_means, trained_covs, trained_mixing_coeffs)
    """
    # Vectorized Gaussian density (shadows the module-level scalar version):
    # evaluates N(x; mean, cov_mat) for every row of x at once.
    def gaussian_multivar(x, mean, cov_mat):
        dimn = np.shape(x)[1]
        inv_cov = np.linalg.inv(cov_mat)
        x_minus_mu = x - mean
        # Row-wise quadratic form (x - mu) Sigma^{-1} (x - mu)^T.
        arg_for_expn = 0.5 * np.sum(x_minus_mu @ inv_cov * x_minus_mu, axis=1)
        det_cov = np.linalg.det(cov_mat)
        prob = np.exp(-arg_for_expn) / (np.sqrt(det_cov) * ((2 * np.pi) ** (dimn / 2)))
        return prob
    dimn_of_prob = X.shape[1]
    # Random initial guess for the GMM parameters.
    initial_means = np.random.rand(L, dimn_of_prob)
    initial_covs = [np.eye(dimn_of_prob) for _ in range(L)]
    initial_mixing_coeffs = np.random.rand(1, L)
    initial_mixing_coeffs /= np.sum(initial_mixing_coeffs)  # Normalize to sum to 1
    # Create a dictionary to store covariance matrices (the format EM_training expects).
    covariance_matrices = {i: initial_covs[i] for i in range(L)}
    # Train the GMM model using the EM algorithm.
    trained_means, trained_covs, trained_mixing_coeffs = EM_training(
        mean=initial_means,
        mixing_coeffs=initial_mixing_coeffs,
        covs=covariance_matrices,
        dataset_input=X,
        num_iters=100,
        tol=0.0001
    )
    print("trained means are:"); print(trained_means)
    print("trained_covs are:");print(trained_covs)
    print("trained_mixing_coeffs are:");print(trained_mixing_coeffs)
    # Plot the original data.
    plt.figure(figsize=(8, 6))
    plt.scatter(X[:, 0], X[:, 1], c='blue', label='make_moons dataset', alpha=0.5)
    # Evaluate the GMM density on a 100x100 grid spanning the data range.
    x, y = np.meshgrid(np.linspace(X[:, 0].min(), X[:, 0].max(), 100), np.linspace(X[:, 1].min(), X[:, 1].max(), 100))
    xy = np.column_stack([x.ravel(), y.ravel()])
    pdf_values = np.zeros_like(x)
    for idx in range(L):
        # Weighted component density, accumulated into the mixture pdf.
        component_pdf = trained_mixing_coeffs[0, idx] * gaussian_multivar(xy, trained_means[idx, :], trained_covs[idx])
        pdf_values += component_pdf.reshape(x.shape)
    # Create a contour plot for the GMM model with a color bar.
    contour = plt.contourf(x, y, pdf_values, levels=10, cmap='viridis', alpha=0.5)
    colorbar = plt.colorbar(contour, label='Probability Density')
    plt.xlabel('Feature 1')
    plt.ylabel('Feature 2')
    plt.legend()
    plt.title('GMM Model and make_moons Dataset')
    plt.show()
    return trained_means, trained_covs, trained_mixing_coeffs
In a Gaussian Mixture Model (GMM), each of the multiple Gaussian components represents a cluster or mode in the data. The "mixing coefficients" indicate the probability of choosing each component when generating data:
Step 1. Component Selection: First, one needs to select one of the Gaussian components to generate a data point. We do this by sampling from the mixing coefficients. For example, if we have two components with mixing coefficients [0.4, 0.6], we'd choose the first component with a 40% chance and the second component with a 60% chance.
Step.2. Sample from the Selected Component: Once we've selected a component, we sample a data point from that component's Gaussian distribution. This is done by generating random numbers according to the mean and covariance matrix of the selected component's Gaussian distribution.
Step 3: Repeat: We repeat these steps to generate as many data points as you need.
# sample the data_points from the GMM model
def sample_from_gmm(num_samples, mixing_coeffs, mean_array, covs, X, want_scatter_plot=False):
    """Draw samples from a fitted GMM by ancestral sampling.

    Args:
        num_samples: number of points to draw.
        mixing_coeffs: array of shape (1, L) with the component weights.
        mean_array: array of shape (L, d) with the component means.
        covs: dict of L covariance matrices (d x d each).
        X: original dataset, used only for the optional comparison plot.
        want_scatter_plot: if True, scatter-plot X against the generated samples.

    Returns:
        Array of shape (num_samples, d).
    """
    # (The original imported scipy.stats.multivariate_normal here but never
    # used it — removed.)
    samples = []
    for _ in range(num_samples):
        # Step 1: pick a component with probability equal to its mixing weight.
        component = np.random.choice(len(mixing_coeffs[0]), p=mixing_coeffs[0])
        # Step 2: draw one point from that component's Gaussian.
        sample = np.random.multivariate_normal(mean_array[component, :], covs[component])
        samples.append(sample)
    samples = np.array(samples)
    if want_scatter_plot:
        plt.figure(2)
        plt.figure(figsize=(8, 6))
        plt.scatter(X[:, 0], X[:, 1], c='blue', label='make_moons dataset', alpha=0.5)
        plt.scatter(samples[:, 0], samples[:, 1], c='red', label='Generated Data', alpha=0.5)
        plt.xlabel('Feature 1')
        plt.ylabel('Feature 2')
        plt.legend()
        plt.title('Generated Data from GMM Model')
        plt.show()
    return samples
However, something we should have added to our implementation is a regularisation term (the sample solution does this, defining it as epsilon), which is used when updating the covariance matrices in the M-step of the EM algorithm. This regularisation term is added to the covariance matrices to ensure that each matrix remains positive definite.
Vectorization: We could have used NumPy functions for operations on arrays instead of explicit loops where possible. This can significantly speed up calculations.# Instead of using explicit loops
temp_vec = np.array([gamma[idx_over_data, idx_el] * dataset_input[idx_over_data, :] for idx_over_data in range(0, num_data)])
mean[idx_el, :] = (1. / N_l[idx_el]) * np.sum(temp_vec, axis=0)
# Use NumPy functions for vectorized operations
temp_vec = gamma[:, idx_el][:, np.newaxis] * dataset_input
mean[idx_el, :] = np.sum(temp_vec, axis=0) / N_l[idx_el]
Avoid list comprehension for large arrays: In our list comprehensions, we are creating lists and then summing them up. This can be memory-intensive for large datasets. Instead, we could have used the array operations directly.# Instead of list comprehensions
temp_outer_prod_list = np.array([gamma[idx_over_data, idx_el] * (dataset_input[idx_over_data, :] - mean[idx_el, :]).reshape((dimn_of_prob, 1)) @
(dataset_input[idx_over_data, :] - mean[idx_el, :]).reshape((1, dimn_of_prob)) for idx_over_data in range(0, num_data)])
# Use array operations directly
temp_diff = dataset_input - mean[idx_el, :]
temp_outer_prod_list = gamma[:, idx_el][:, np.newaxis, np.newaxis] * np.einsum('ij,ik->ijk', temp_diff, temp_diff)
covs[idx_el] = np.sum(temp_outer_prod_list, axis=0) / N_l[idx_el]
list(datasets.keys())
['Small', 'Medium', 'Large']
### fit the GMM model to the data and then sample
# For each dataset size, fit GMMs with L = 1..6 components and draw as many
# samples as there are data points; results are stored per (size, L).
dataset_for_GMM={}; keys_dataset=list(datasets.keys())
for data_size in keys_dataset:
    print("----------------DATASIZE:",data_size,"------------------")
    print("_________________________________________________________")
    X=datasets[data_size];temp={}
    for idx_el in range(1,7):  # idx_el = number of mixture components L
        print("index_el:",idx_el)
        trained_means, trained_covs, trained_mixing_coeffs=fit_the_GMM(X=X, L=idx_el)
        generated_samples=sample_from_gmm(num_samples=X.shape[0],mixing_coeffs=trained_mixing_coeffs,
                                          mean_array=trained_means,
                                          covs=trained_covs,
                                          X=X, want_scatter_plot=True)
        temp[idx_el]=generated_samples
    dataset_for_GMM[data_size]=temp
----------------DATASIZE: Small ------------------
_________________________________________________________
index_el: 1
L is:
1
trained means are:
[[0.50006995 0.2464502 ]]
trained_covs are:
{0: array([[ 0.753472 , -0.17935397],
[-0.17935397, 0.24466648]])}
trained_mixing_coeffs are:
[[1.]]
<Figure size 640x480 with 0 Axes>
index_el: 2
L is:
2
trained means are:
[[ 0.80769015 0.17751047]
[-0.74562807 0.52561939]]
trained_covs are:
{0: array([[ 0.44948797, -0.13198018],
[-0.13198018, 0.25881563]]), 1: array([[0.04948005, 0.06244601],
[0.06244601, 0.09018868]])}
trained_mixing_coeffs are:
[[0.80195932 0.19804068]]
<Figure size 640x480 with 0 Axes>
index_el: 3
L is:
3
trained means are:
[[ 0.71459993 0.281603 ]
[ 0.61840201 0.75452719]
[ 0.40285576 -0.01647488]]
trained_covs are:
{0: array([[ 0.91569742, -0.24268738],
[-0.24268738, 0.1313912 ]]), 1: array([[ 0.60645975, -0.18408576],
[-0.18408576, 0.06919984]]), 2: array([[ 0.77493737, -0.22318156],
[-0.22318156, 0.15328593]])}
trained_mixing_coeffs are:
[[0.10380025 0.30088703 0.59531273]]
<Figure size 640x480 with 0 Axes>
index_el: 4
L is:
4
trained means are:
[[-0.74373538 0.52724569]
[ 0.49212539 -0.00998175]
[ 0.78034677 0.723156 ]
[ 1.50025431 -0.31479398]]
trained_covs are:
{0: array([[0.04925169, 0.06251597],
[0.06251597, 0.09093562]]), 1: array([[ 0.14582385, -0.03125686],
[-0.03125686, 0.12055593]]), 2: array([[ 0.57467376, -0.19671174],
[-0.19671174, 0.07751427]]), 3: array([[0.08423101, 0.04730756],
[0.04730756, 0.03251709]])}
trained_mixing_coeffs are:
[[0.2055128 0.33860229 0.27456441 0.18132049]]
<Figure size 640x480 with 0 Axes>
index_el: 5
L is:
5
trained means are:
[[ 0.69253483 -0.09092805]
[ 0.4064456 0.57350401]
[ 1.81651015 0.01400419]
[-0.70462613 0.60873619]
[-0.57022529 0.47183015]]
trained_covs are:
{0: array([[ 0.1882859 , -0.07826564],
[-0.07826564, 0.18301614]]), 1: array([[ 0.11998602, -0.02308836],
[-0.02308836, 0.17391539]]), 2: array([[0.02825188, 0.04299729],
[0.04299729, 0.07831436]]), 3: array([[0.05003109, 0.05501902],
[0.05501902, 0.06405516]]), 4: array([[0.18140737, 0.19641821],
[0.19641821, 0.21639144]])}
trained_mixing_coeffs are:
[[0.34796375 0.24753802 0.17313393 0.17922193 0.05214237]]
<Figure size 640x480 with 0 Axes>
index_el: 6
L is:
6
trained means are:
[[-0.73627212 0.53576089]
[ 0.04559832 0.22281345]
[ 0.65519663 0.6311833 ]
[ 1.78714662 -0.01732783]
[ 0.71546059 -0.38751407]
[-0.10457785 0.97105925]]
trained_covs are:
{0: array([[0.05094303, 0.06435182],
[0.06435182, 0.09268741]]), 1: array([[ 0.00268605, -0.00784069],
[-0.00784069, 0.03525823]]), 2: array([[ 0.09458215, -0.09117792],
[-0.09117792, 0.10940841]]), 3: array([[0.03685224, 0.0502607 ],
[0.0502607 , 0.08314315]]), 4: array([[ 0.12809965, -0.04311216],
[-0.04311216, 0.01943504]]), 5: array([[0.0019856 , 0.00160915],
[0.00160915, 0.00448503]])}
trained_mixing_coeffs are:
[[0.21003668 0.09719611 0.2520742 0.18888608 0.21205136 0.03975557]]
<Figure size 640x480 with 0 Axes>
----------------DATASIZE: Medium ------------------
_________________________________________________________
index_el: 1
L is:
1
trained means are:
[[0.49985466 0.24700133]]
trained_covs are:
{0: array([[ 0.74904345, -0.19119175],
[-0.19119175, 0.2433779 ]])}
trained_mixing_coeffs are:
[[1.]]
<Figure size 640x480 with 0 Axes>
index_el: 2
L is:
2
trained means are:
[[0.44917222 0.04491844]
[0.62286977 0.73749168]]
trained_covs are:
{0: array([[ 0.81234166, -0.228497 ],
[-0.228497 , 0.17381113]]), 1: array([[ 0.57404025, -0.18584238],
[-0.18584238, 0.07252806]])}
trained_mixing_coeffs are:
[[0.7082144 0.2917856]]
<Figure size 640x480 with 0 Axes>
index_el: 3
L is:
3
trained means are:
[[-0.7737834 0.51975744]
[ 1.76511022 -0.04300051]
[ 0.49330375 0.25437384]]
trained_covs are:
{0: array([[0.040528 , 0.05267169],
[0.05267169, 0.08300167]]), 1: array([[0.04383146, 0.0543914 ],
[0.0543914 , 0.08102181]]), 2: array([[ 0.18834427, -0.12138886],
[-0.12138886, 0.29419402]])}
trained_mixing_coeffs are:
[[0.18909945 0.19354863 0.61735192]]
<Figure size 640x480 with 0 Axes>
index_el: 4
L is:
4
trained means are:
[[ 0.48864373 -0.18266959]
[-0.69458279 0.58416462]
[ 1.75212006 -0.0556088 ]
[ 0.57128862 0.66601923]]
trained_covs are:
{0: array([[ 0.17184121, -0.11440647],
[-0.11440647, 0.10408415]]), 1: array([[0.06983778, 0.07260714],
[0.07260714, 0.09321453]]), 2: array([[0.04731459, 0.05721665],
[0.05721665, 0.08278782]]), 3: array([[ 0.13389047, -0.10183705],
[-0.10183705, 0.09826414]])}
trained_mixing_coeffs are:
[[0.30155874 0.22387285 0.20060659 0.27396182]]
<Figure size 640x480 with 0 Axes>
index_el: 5
L is:
5
trained means are:
[[ 0.8985737 -0.45494526]
[ 1.00047115 0.27866819]
[ 0.26990034 0.8767014 ]
[-0.77334284 0.52525856]
[ 1.00370896 -0.17079393]]
trained_covs are:
{0: array([[ 0.09583784, -0.01040648],
[-0.01040648, 0.00459743]]), 1: array([[ 0.61772248, -0.02023903],
[-0.02023903, 0.02871827]]), 2: array([[ 0.12956334, -0.03827076],
[-0.03827076, 0.01906512]]), 3: array([[0.03871495, 0.05015285],
[0.05015285, 0.0782377 ]]), 4: array([[ 0.50148919, -0.03404347],
[-0.03404347, 0.02566867]])}
trained_mixing_coeffs are:
[[0.15765052 0.24341738 0.21396886 0.18540784 0.1995554 ]]
<Figure size 640x480 with 0 Axes>
index_el: 6
L is:
6
trained means are:
[[-0.79592265 0.49642325]
[ 0.82699921 0.4705127 ]
[ 0.04192508 0.95596168]
[ 1.81433009 0.00769911]
[ 0.9882576 -0.46320741]
[ 0.18051161 0.02682711]]
trained_covs are:
{0: array([[0.03271249, 0.04497008],
[0.04497008, 0.07611623]]), 1: array([[ 0.02727174, -0.03769265],
[-0.03769265, 0.06601615]]), 2: array([[ 0.0738626 , -0.00207535],
[-0.00207535, 0.00401852]]), 3: array([[0.02719964, 0.038613 ],
[0.038613 , 0.06770735]]), 4: array([[ 0.08176539, -0.00181037],
[-0.00181037, 0.00338919]]), 5: array([[ 0.02878055, -0.04141316],
[-0.04141316, 0.07297738]])}
trained_mixing_coeffs are:
[[0.18017456 0.16739798 0.15220133 0.17229991 0.15677852 0.17114771]]
<Figure size 640x480 with 0 Axes>
----------------DATASIZE: Large ------------------
_________________________________________________________
index_el: 1
L is:
1
trained means are:
[[0.5006108 0.24937081]]
trained_covs are:
{0: array([[ 0.75409776, -0.19372119],
[-0.19372119, 0.24760576]])}
trained_mixing_coeffs are:
[[1.]]
<Figure size 640x480 with 0 Axes>
index_el: 2
L is:
2
trained means are:
[[ 0.62829721 0.09056855]
[-0.0507244 0.93506067]]
trained_covs are:
{0: array([[ 0.81228134, -0.13225792],
[-0.13225792, 0.16964158]]), 1: array([[0.12849924, 0.00648573],
[0.00648573, 0.00518643]])}
trained_mixing_coeffs are:
[[0.81195531 0.18804469]]
<Figure size 640x480 with 0 Axes>
index_el: 3
L is:
3
trained means are:
[[ 1.74385897 -0.06299622]
[-0.76689092 0.54119564]
[ 0.48653496 0.26148846]]
trained_covs are:
{0: array([[0.05194519, 0.06166023],
[0.06166023, 0.09103468]]), 1: array([[0.04516723, 0.05632036],
[0.05632036, 0.08569044]]), 2: array([[ 0.17811734, -0.10976195],
[-0.10976195, 0.29223171]])}
trained_mixing_coeffs are:
[[0.20444224 0.19384816 0.60170959]]
<Figure size 640x480 with 0 Axes>
index_el: 4
L is:
4
trained means are:
[[ 1.81235615 0.00383034]
[ 0.21044536 0.12828942]
[ 0.77699853 0.37363025]
[-0.81836973 0.48956207]]
trained_covs are:
{0: array([[0.0286931 , 0.04150261],
[0.04150261, 0.07661638]]), 1: array([[ 0.14192445, -0.18107452],
[-0.18107452, 0.28678854]]), 2: array([[ 0.14993125, -0.18741391],
[-0.18741391, 0.29675392]]), 3: array([[0.02890473, 0.04178497],
[0.04178497, 0.07497171]])}
trained_mixing_coeffs are:
[[0.17381666 0.32509748 0.33048881 0.17059706]]
<Figure size 640x480 with 0 Axes>
index_el: 5
L is:
5
trained means are:
[[ 1.81236186 0.00516478]
[ 0.619187 0.64041602]
[-0.76123566 0.54781842]
[ 0.08324945 0.24193606]
[ 0.93948828 -0.45298527]]
trained_covs are:
{0: array([[0.02838007, 0.04070096],
[0.04070096, 0.07516323]]), 1: array([[ 0.11794533, -0.10043192],
[-0.10043192, 0.10747158]]), 2: array([[0.04568965, 0.0567913 ],
[0.0567913 , 0.08589983]]), 3: array([[ 0.03743876, -0.07661123],
[-0.07661123, 0.19122726]]), 4: array([[ 0.09528433, -0.00751979],
[-0.00751979, 0.00424314]])}
trained_mixing_coeffs are:
[[0.17411171 0.26193946 0.19831888 0.19815708 0.16747287]]
<Figure size 640x480 with 0 Axes>
index_el: 6
L is:
6
trained means are:
[[ 0.26756185 -0.07134769]
[-0.86519315 0.42787887]
[ 1.68143494 -0.11412851]
[ 0.94456531 -0.05292318]
[ 0.58870719 0.76962196]
[-0.19904052 0.94382425]]
trained_covs are:
{0: array([[ 0.05511159, -0.06300095],
[-0.06300095, 0.0884764 ]]), 1: array([[0.01717837, 0.02795799],
[0.02795799, 0.06008518]]), 2: array([[0.0742036 , 0.07681313],
[0.07681313, 0.09843615]]), 3: array([[0.0076563 , 0.00114038],
[0.00114038, 0.14426678]]), 4: array([[ 0.04412181, -0.03133136],
[-0.03133136, 0.02659675]]), 5: array([[0.06481334, 0.01158746],
[0.01158746, 0.00533173]])}
trained_mixing_coeffs are:
[[0.21219401 0.14627734 0.23401158 0.13300356 0.13446874 0.14004476]]
<Figure size 640x480 with 0 Axes>
Both sample code and our code implements the similar concepts, hence we believe that they are essentially same and should generate similar results.
The sample solution considers L=20 directly for fitting the Gaussian mixture, hence it obviously gives a better answer than ours (we expect a similar result for L=20 in our case). However, in our implementation, by taking values of L=1,2,3,4,5,6, we have explicitly shown — both via contour plots (which are missing in the sample solution) and through the MMD-squared calculation carried out below (which the sample solution also shows) — how increasing the value of L (having more mixture components) improves the model.
# with open('GMM_dataset_dictionary.pkl', 'wb') as f:
# pickle.dump(dataset_for_GMM, f)
Observations:
For the given data-set, increasing the number of mixers in our GMM model helps in learning the distribution, something we would expect as the dataset is clearly not uni-modal so to speak.
As can clearly be seen, for a particular L (number of mixture components in the GMM), the larger the dataset, the better the estimate of the probability density. With a larger dataset, a GMM with a low value of L (few mixture components) can model the distribution as well as a GMM with a large L trained on fewer data points.
This can also be seen more explicitly from the plot of $mmd^{2}$ for different L for each dataset of different sizes ('small':100 data points, 'medium':500 data points, 'high':1000 datapoints) plotted below!
Note that the kernel density estimation model is a non-parametric model, hence it does not require any "training" so to speak!
discusses how to choose an appropriate bandwidth parameter: https://stats.libretexts.org/Bookshelves/Computing_and_Modeling/Supplemental_Modules_(Computing_and_Modeling)/Regression_Analysis/Nonparametric_Inference_-_Kernel_Density_Estimation#:~:text=Contributors-,Introduction%20and%20definition,%E2%88%88%CE%98%E2%8A%82Rd%7D.
Squared exponential kernel:
https://www.mathworks.com/help/stats/kernel-covariance-function-options.html
def squared_exp_kernel(xa: np.array, xb: np.array, bandwidth: float, amplitude: float = 1):
    '''
    Squared-exponential (RBF) kernel k(xa, xb) = amplitude^2 * exp(-||xa - xb||^2 / (2 * bandwidth)).

    xa: row vector of shape (1 x dimn_of_the_prob)
    xb: row vector of shape (1 x dimn_of_the_prob)
    bandwidth: a hyperparameter (what we call bandwidth is also referred to as
        sigma^2 in some of the literature; we follow the lecture's notation)
    amplitude: overall variance (set to 1 in the lecture, can be changed as well)
    '''
    displacement = xa - xb
    squared_distance = np.dot(displacement, displacement)
    return (amplitude ** 2) * np.exp(-squared_distance / (2 * bandwidth))
def inverse_multi_quadratic_kernel(xa: np.array, xb: np.array, bandwidth: float):
    '''
    Inverse multi-quadratic kernel k(xa, xb) = b / (b + ||xa - xb||^2).

    xa: row vector of shape (1 x dimn_of_the_prob)
    xb: row vector of shape (1 x dimn_of_the_prob)
    bandwidth: a hyperparameter
    '''
    squared_distance = norm(xa - xb) ** 2
    return bandwidth / (bandwidth + squared_distance)
def kernel_density_estimator(x: np.array, dataset: np.array, bandwidth: float, kernel_fn):
    '''
    Evaluate the KDE probability estimate at x: (1/N) * sum_j k(x, x_j).

    x: a row vector (1 x dimn of the problem) at which to evaluate the KDE
    dataset: an array of shape (N x dimn of the problem)
    bandwidth: hyperparameter
    kernel_fn: kernel function chosen by the user

    Note: some of the literature normalizes by 1/(bandwidth * N) instead of 1/N.
    '''
    num_data = np.shape(dataset)[0]
    total = sum(kernel_fn(xa=x, xb=dataset[j, :], bandwidth=bandwidth) for j in range(num_data))
    return (1. / num_data) * total
def estimate_bandwidth_with_silverman_rule(data):
    """Estimate a KDE bandwidth via Silverman's rule of thumb.

    h = 0.9 * min(sigma, IQR / 1.34) * n**(-1/5)

    For multivariate data both the standard deviation and the IQR are
    computed per feature (axis 0) and then averaged, so the two spread
    estimates are on the same scale.  (The original version flattened the
    whole array for the std but computed the IQR per feature, mixing the
    two scales.)

    data: array of shape (n_samples, n_features).
    Returns a single scalar bandwidth.
    """
    # Step 1: per-feature standard deviation, averaged over features.
    std_dev = np.mean(np.std(data, axis=0))
    # Step 2: per-feature interquartile range, averaged over features.
    Q1 = np.percentile(data, 25, axis=0)
    Q3 = np.percentile(data, 75, axis=0)
    IQR = np.mean(Q3 - Q1)
    # Step 3: Silverman's rule of thumb.
    n = len(data)
    bandwidth = 0.9 * min(std_dev, IQR / 1.34) * n ** (-1 / 5)
    return bandwidth
# contour plot kde
def contour_plot_kde(X,bandwidth,xmin=-2,xmax=2,ymin=-1.5,ymax=1.5):
    """Filled contour plot of the KDE density over a 2-D grid, with the
    original data points overlaid.

    X: dataset of shape (N, 2).
    bandwidth: KDE bandwidth forwarded to the squared-exponential kernel.
    xmin/xmax/ymin/ymax: plotting window bounds.
    """
    # # Create a grid of points within the specified range
    x_grid = np.linspace(xmin, xmax, 100)
    y_grid = np.linspace(ymin, ymax, 100)
    X_grid, Y_grid = np.meshgrid(x_grid, y_grid)
    points = np.c_[X_grid.ravel(), Y_grid.ravel()]
    # Evaluate the KDE at each point in the grid (100x100 = 10k evaluations,
    # each an O(N) sum over the dataset — slow for large N).
    kde_values = np.array([kernel_density_estimator(point, X, bandwidth,kernel_fn=squared_exp_kernel) for point in points])
    kde_values = kde_values.reshape(X_grid.shape)
    # Create a contour plot
    plt.figure(figsize=(8, 6))
    plt.contourf(X_grid, Y_grid, kde_values, levels=100, cmap='viridis')
    plt.scatter(X[:, 0], X[:, 1], c='blue', label='make_moons dataset', alpha=0.5)
    plt.colorbar(label='Probability Density')
    plt.xlabel('Feature 1')
    plt.ylabel('Feature 2')
    plt.title('Scatter Plot for data and Contour Plot of KDE Probability Density')
    plt.legend()
    plt.show()
# contour plot for distribution from the KDE
# xmin, xmax = -1, 2 # Adjust the range as needed
# ymin, ymax = -1, 1 # Adjust the range as needed
# # bandwidth = estimate_bandwidth_with_silverman_rule(X)
# # print("Estimated bandwidth using silverman rule:", bandwidth)
# contour_plot_kde(X,bandwidth=0.1,xmin=-1,xmax=2,ymin=-1,ymax=1)
### Sampling from the KDE
def sample_from_kde(data, bandwidth, num_samples,want_scatter_plot=False):
    """Draw samples from a Gaussian KDE fitted to `data`.

    Sampling strategy: pick a data point uniformly at random and add
    N(0, bandwidth) noise to each coordinate (the Gaussian-kernel smoothing).

    data: array of shape (N, dim).
    bandwidth: standard deviation of the per-coordinate perturbation.
    num_samples: number of samples to draw.
    want_scatter_plot: if True, overlay original vs. generated points
        (assumes 2-D data for plotting).

    Returns an array of shape (num_samples, dim).
    """
    # Vectorised: draw all base points and perturbations in one shot instead
    # of looping per sample.
    base_points = data[np.random.randint(len(data), size=num_samples)]
    perturbations = np.random.normal(0, bandwidth, size=base_points.shape)
    samples = base_points + perturbations
    if want_scatter_plot:
        # visualise the original and the generated data.
        # Bug fix: the original plotted the *global* `X` here instead of the
        # `data` argument, which breaks (or plots the wrong dataset) whenever
        # the caller's array is not named X.
        plt.figure(figsize=(8,6))
        plt.scatter(data[:, 0], data[:, 1], c='b', label='Original Data')
        plt.scatter(samples[:, 0], samples[:, 1], c='r', label='Generated Data')
        plt.legend()
        plt.tight_layout()
        plt.show()
    return samples
Implementation of concept: Both the sample solution and our solution implement the concept of the KDE correctly. However, our code provides more flexibility by allowing the user to choose different kernel functions, so the KDE can be tailored to specific characteristics of the data. As far as sampling is concerned, both codes implement the same strategy of "introducing random perturbations to a randomly selected data point from the original dataset".
We can make following improvements to make our code fast:
2.1. Vectorization: We could have utilized the NumPy vectorized operations instead of explicit loops as they are usually more efficient.
# Instead of looping through data points
for i in range(num_samples):
data_point = data[np.random.randint(len(data))]
perturbations = np.random.normal(0, bandwidth, size=data_point.shape)
sample = data_point + perturbations
# Use vectorized operations
data_points = data[np.random.randint(len(data), size=num_samples)]
perturbations = np.random.normal(0, bandwidth, size=(num_samples, data.shape[1]))
samples = data_points + perturbations
2.2 Precompute Random Perturbations: If the same perturbations are used for all samples, precompute them to avoid redundant computations inside the loop.
# Instead of
for i in range(num_samples):
data_point = data[np.random.randint(len(data))]
perturbations = np.random.normal(0, bandwidth, size=data_point.shape)
sample = data_point + perturbations
# Use Precomputed perturbations
perturbations = np.random.normal(0, bandwidth, size=(num_samples, data.shape[1]))
# Inside the loop
for i in range(num_samples):
data_point = data[np.random.randint(len(data))]
sample = data_point + perturbations[i]
We believe our solutions (the cases (hyperparameter) that we considered to explore the problem) provides more insight into how the KDE behaves. We have considered both the appropriate value of bandwidth (around 0.1) as well as others. We have also explicitly shown through contour plots how the prob distn generated by the KDE behaves for different values of the bandwidth (this is not done in the sample solution).
# Sweep every dataset size over a grid of bandwidths: draw a contour plot,
# sample as many points as the dataset has, and keep the samples for later
# MMD scoring in dataset_kde[size][bandwidth].
dataset_kde={};bandwidth=[0.01,0.05,0.1,0.15,0.2,0.25,0.3,0.35,0.4];keys_dataset=list(datasets.keys())
keys_dataset=list(datasets.keys())  # NOTE: duplicate of the assignment above
for data_size in keys_dataset:
    print("--------data_size:",data_size)
    X=datasets[data_size];temp={}
    for bw in bandwidth:
        print("bandwidth:",bw)
        contour_plot_kde(X,bandwidth=bw,xmin=-2,xmax=2.5,ymin=-1.5,ymax=1.5)
        # Generate one synthetic sample per real data point.
        generated_data_KDE=sample_from_kde(data=X,bandwidth=bw,num_samples=X.shape[0],want_scatter_plot=True)
        temp[bw]=generated_data_KDE
    dataset_kde[data_size]=temp
--------data_size: Small bandwidth: 0.01
bandwidth: 0.05
bandwidth: 0.1
bandwidth: 0.15
bandwidth: 0.2
bandwidth: 0.25
bandwidth: 0.3
bandwidth: 0.35
bandwidth: 0.4
--------data_size: Medium bandwidth: 0.01
bandwidth: 0.05
bandwidth: 0.1
bandwidth: 0.15
bandwidth: 0.2
bandwidth: 0.25
bandwidth: 0.3
bandwidth: 0.35
bandwidth: 0.4
--------data_size: Large bandwidth: 0.01
bandwidth: 0.05
bandwidth: 0.1
bandwidth: 0.15
bandwidth: 0.2
bandwidth: 0.25
bandwidth: 0.3
bandwidth: 0.35
bandwidth: 0.4
# with open('KDE_dataset_dictionary_smaller_bw_range.pkl', 'wb') as f:
# pickle.dump(dataset_kde, f)
a good resource for MMD: https://www.onurtunali.com/ml/2019/03/08/maximum-mean-discrepancy-in-machine-learning.html
import numpy as np
def estimate_mmd_squared(samples1, samples2, kernel_function, bandwidth=1.0):
    """
    Unbiased estimate of the squared Maximum Mean Discrepancy (MMD^2)
    between two sets of samples using a kernel function.

    Args:
    - samples1: A numpy array of shape (n_samples1, n_features)
      representing the first set of samples.
    - samples2: A numpy array of shape (n_samples2, n_features)
      representing the second set of samples.
    - kernel_function: A function that computes the kernel value between
      two samples, called as kernel_function(x, y, bandwidth).
    - bandwidth: Bandwidth parameter for the kernel function.

    Returns:
    - mmd_squared: The estimated MMD squared.

    Note: the within-set terms are normalised by n*(n-1), which is the
    *unbiased* estimator, so the diagonal i == j terms must be excluded.
    (The original version summed the diagonal too, mixing the biased
    1/n^2 and unbiased 1/(n(n-1)) formulas.)
    """
    num_samples1 = samples1.shape[0]
    num_samples2 = samples2.shape[0]
    term_xx, term_yy, term_xy = 0.0, 0.0, 0.0
    # E[k(x, x')] over distinct pairs of samples1.
    for i in range(num_samples1):
        for j in range(num_samples1):
            if i != j:  # exclude diagonal for the unbiased estimator
                term_xx += kernel_function(samples1[i], samples1[j], bandwidth)
    term_xx *= 1.0 / (num_samples1 * (num_samples1 - 1))
    # E[k(y, y')] over distinct pairs of samples2.
    for i in range(num_samples2):
        for j in range(num_samples2):
            if i != j:
                term_yy += kernel_function(samples2[i], samples2[j], bandwidth)
    term_yy *= 1.0 / (num_samples2 * (num_samples2 - 1))
    # -2 E[k(x, y)] over all cross pairs (no diagonal to exclude here).
    for i in range(num_samples1):
        for j in range(num_samples2):
            term_xy -= 2.0 * kernel_function(samples1[i], samples2[j], bandwidth)
    term_xy *= 1.0 / (num_samples1 * num_samples2)
    mmd_squared = term_xx + term_yy + term_xy
    return mmd_squared
def squared_exp_kernel(xa, xb, bandwidth, amplitude=1.0):
    """
    Squared Exponential (RBF) kernel function.

    Args:
    - xa, xb: numpy arrays holding one sample each.
    - bandwidth: length-scale hyperparameter.
    - amplitude: overall variance (default is 1).

    Returns:
    - The scalar value amplitude^2 * exp(-||xa - xb||^2 / (2 * bandwidth)).
    """
    gap = xa - xb
    return (amplitude ** 2) * np.exp(-np.dot(gap, gap) / (2 * bandwidth))
def inverse_multi_quadratic_kernel(xa, xb, bandwidth):
    """
    Inverse Multi-Quadratic kernel function.

    Args:
    - xa, xb: numpy arrays holding one sample each.
    - bandwidth: bandwidth parameter.

    Returns:
    - The scalar value bandwidth / (bandwidth + ||xa - xb||^2).
    """
    sq_dist = np.sum((xa - xb) ** 2)
    return bandwidth / (bandwidth + sq_dist)
def plot_dict_of_dicts(data, x_label='X-values', y_label='Y-values', title='Two-Line Point Plot'):
    """
    Plot each inner dictionary of `data` as one dotted, marker-annotated line.

    Args:
        data (dict): {series_label: {x_value: y_value, ...}, ...}
        x_label (str): label for the x-axis.
        y_label (str): label for the y-axis.
        title (str): title for the plot.

    Returns:
        None
    """
    plt.figure(figsize=(8, 6))
    for series_label, xy_pairs in data.items():
        xs = list(xy_pairs.keys())
        ys = list(xy_pairs.values())
        plt.plot(xs, ys, ":", marker='o', label=series_label)
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.title(title)
    plt.legend()
    plt.grid(True)
    plt.show()
# MMD^2 (squared-exponential kernel) between each GMM sample set and the
# corresponding real dataset, for L = 1..6 mixture components.
size_dataset=["Small","Medium","Large"];mmd_gmm_model_sqd_exp_kernel={}
for data_size in size_dataset:
    print("----------------DATASIZE:",data_size,"------------------")
    X=datasets[data_size];
    temp={}
    for idx_el in range(1,7):
        # dataset_for_GMM is built in an earlier (not shown) cell; presumably
        # samples drawn from a GMM with idx_el components — TODO confirm.
        dataset_sampled_gmm=dataset_for_GMM[data_size][idx_el]
        temp[idx_el]=estimate_mmd_squared(samples1=dataset_sampled_gmm,
                                          samples2=X, kernel_function=squared_exp_kernel)
        print("for L=",idx_el," MMD^2 score:");print(temp[idx_el])
    mmd_gmm_model_sqd_exp_kernel[data_size]=temp
----------------DATASIZE: Small ------------------ for L= 1 MMD^2 score: 0.021462996536169454 for L= 2 MMD^2 score: 0.015153576109734779 for L= 3 MMD^2 score: 0.023083391802166675 for L= 4 MMD^2 score: 0.0157404628433937 for L= 5 MMD^2 score: 0.012968477216233731 for L= 6 MMD^2 score: 0.011881913431552427 ----------------DATASIZE: Medium ------------------ for L= 1 MMD^2 score: 0.00808000119872232 for L= 2 MMD^2 score: 0.007847838415810715 for L= 3 MMD^2 score: 0.002854229711848566 for L= 4 MMD^2 score: 0.0026202490229276787 for L= 5 MMD^2 score: 0.0030813828475186877 for L= 6 MMD^2 score: 0.0022849453894315808 ----------------DATASIZE: Large ------------------ for L= 1 MMD^2 score: 0.008141649846514776 for L= 2 MMD^2 score: 0.0035073972497046135 for L= 3 MMD^2 score: 0.0015671430299024625 for L= 4 MMD^2 score: 0.0019851960641910082 for L= 5 MMD^2 score: 0.0013715822944770917 for L= 6 MMD^2 score: 0.0015351823895199956
# Plot MMD^2 vs. L for the GMM (squared-exponential kernel), one line per
# dataset size.
plot_dict_of_dicts(data=mmd_gmm_model_sqd_exp_kernel,
                   x_label="L values",
                   y_label="MMD_squared",
                   title="MMD_squared (with sqd_exp kernel) plot for GMM for dataset of different lengths ")
# MMD^2 (squared-exponential kernel) between each KDE sample set (built in
# the sweep above) and the corresponding real dataset, per bandwidth.
mmd_kde_model_sqd_exp_kernel={}
bandwidth_for_kde=[0.01,0.05,0.1,0.15,0.2,0.25,0.3,0.35,0.4];keys_dataset=list(datasets.keys())
size_dataset=["Small","Medium","Large"]
for data_size in size_dataset:
    print("--------Data_size:",data_size,"-----------")
    X=datasets[data_size];temp={}
    for bw in bandwidth_for_kde:
        dataset_sampled_kde=dataset_kde[data_size][bw]
        temp[bw]=estimate_mmd_squared(samples1=dataset_sampled_kde,
                                      samples2=X, kernel_function=squared_exp_kernel)
        print("for bandwidth:",bw," MMD^2 score:",temp[bw])
    mmd_kde_model_sqd_exp_kernel[data_size]=temp
--------Data_size: Small ----------- for bandwidth: 0.01 MMD^2 score: 0.0227324569272509 for bandwidth: 0.05 MMD^2 score: 0.014870248884561788 for bandwidth: 0.1 MMD^2 score: 0.0152579907442425 for bandwidth: 0.15 MMD^2 score: 0.013652325888171557 for bandwidth: 0.2 MMD^2 score: 0.014246454194056346 for bandwidth: 0.25 MMD^2 score: 0.013828091300571055 for bandwidth: 0.3 MMD^2 score: 0.018295668526015052 for bandwidth: 0.35 MMD^2 score: 0.013389361599205007 for bandwidth: 0.4 MMD^2 score: 0.01629857467187279 --------Data_size: Medium ----------- for bandwidth: 0.01 MMD^2 score: 0.004230828587042268 for bandwidth: 0.05 MMD^2 score: 0.00229779593652113 for bandwidth: 0.1 MMD^2 score: 0.0034189463359119987 for bandwidth: 0.15 MMD^2 score: 0.0030637120826850772 for bandwidth: 0.2 MMD^2 score: 0.0037768409473978304 for bandwidth: 0.25 MMD^2 score: 0.004494465353473864 for bandwidth: 0.3 MMD^2 score: 0.004621511270823331 for bandwidth: 0.35 MMD^2 score: 0.007253995535964286 for bandwidth: 0.4 MMD^2 score: 0.00628754405726728 --------Data_size: Large ----------- for bandwidth: 0.01 MMD^2 score: 0.0016516745432757496 for bandwidth: 0.05 MMD^2 score: 0.0028152656201276827 for bandwidth: 0.1 MMD^2 score: 0.001332365306855099 for bandwidth: 0.15 MMD^2 score: 0.0015215272280293402 for bandwidth: 0.2 MMD^2 score: 0.0026557089785370636 for bandwidth: 0.25 MMD^2 score: 0.002927438603620436 for bandwidth: 0.3 MMD^2 score: 0.003200250134513638 for bandwidth: 0.35 MMD^2 score: 0.004580846148862894 for bandwidth: 0.4 MMD^2 score: 0.006475937707462465
# Plot MMD^2 vs. bandwidth for the KDE (squared-exponential kernel), one
# line per dataset size.
plot_dict_of_dicts(data=mmd_kde_model_sqd_exp_kernel,
                   x_label="bandwidth (h) for KDE",
                   y_label="MMD_squared",
                   title="MMD_squared (with sqd_exp kernel) plot for KDE for dataset of different lengths ")
for GMM
# Same GMM evaluation as above, but with the inverse-multi-quadratic kernel.
size_dataset=["Small","Medium","Large"];mmd_gmm_model_inv_quad={}
for data_size in size_dataset:
    print("----------------DATASIZE:",data_size,"------------------")
    X=datasets[data_size];
    temp={}
    for idx_el in range(1,7):
        print("L:",idx_el)
        dataset_sampled_gmm=dataset_for_GMM[data_size][idx_el]
        temp[idx_el]=estimate_mmd_squared(samples1=dataset_sampled_gmm,
                                          samples2=X, kernel_function=inverse_multi_quadratic_kernel)
        print("For L=",idx_el," MMD^2 score:",temp[idx_el])
    mmd_gmm_model_inv_quad[data_size]=temp
----------------DATASIZE: Small ------------------ L: 1 For L= 1 MMD^2 score: 0.02522100706992847 L: 2 For L= 2 MMD^2 score: 0.019936303817976953 L: 3 For L= 3 MMD^2 score: 0.02869786973074495 L: 4 For L= 4 MMD^2 score: 0.015868479819498127 L: 5 For L= 5 MMD^2 score: 0.01374438812674006 L: 6 For L= 6 MMD^2 score: 0.012066812679013106 ----------------DATASIZE: Medium ------------------ L: 1 For L= 1 MMD^2 score: 0.013191802458515478 L: 2 For L= 2 MMD^2 score: 0.012626625454044249 L: 3 For L= 3 MMD^2 score: 0.0049995159821693 L: 4 For L= 4 MMD^2 score: 0.0032303076786441842 L: 5 For L= 5 MMD^2 score: 0.005232252075411892 L: 6 For L= 6 MMD^2 score: 0.002370310013506516 ----------------DATASIZE: Large ------------------ L: 1 For L= 1 MMD^2 score: 0.01372832858219697 L: 2 For L= 2 MMD^2 score: 0.00682231846961745 L: 3 For L= 3 MMD^2 score: 0.003644204407087237 L: 4 For L= 4 MMD^2 score: 0.004022457051592765 L: 5 For L= 5 MMD^2 score: 0.0018976849608979274 L: 6 For L= 6 MMD^2 score: 0.0017011299403707492
# Plot MMD^2 vs. L for the GMM (inverse-multi-quadratic kernel).
plot_dict_of_dicts(data=mmd_gmm_model_inv_quad,
                   x_label="L values",
                   y_label="MMD_squared",
                   title="MMD_squared (with inv-multi-quad kernel) plot for GMM for dataset of different lengths")
for KDE:
# Same KDE evaluation as above, but with the inverse-multi-quadratic kernel.
mmd_kde_model_inv_quad={}
bandwidth_for_kde=[0.01,0.05,0.1,0.15,0.2,0.25,0.3,0.35,0.4];keys_dataset=list(datasets.keys())
size_dataset=["Small","Medium","Large"]
for data_size in size_dataset:
    print("--------Data_size:",data_size,"-----------")
    X=datasets[data_size];temp={}
    for bw in bandwidth_for_kde:
        dataset_sampled_kde=dataset_kde[data_size][bw]
        temp[bw]=estimate_mmd_squared(samples1=dataset_sampled_kde,
                                      samples2=X, kernel_function=inverse_multi_quadratic_kernel)
        print("for bandwidth:",bw," MMD^2 score:",temp[bw])
    mmd_kde_model_inv_quad[data_size]=temp
--------Data_size: Small ----------- for bandwidth: 0.01 MMD^2 score: 0.021025600080310625 for bandwidth: 0.05 MMD^2 score: 0.01418040320122893 for bandwidth: 0.1 MMD^2 score: 0.01425900388764445 for bandwidth: 0.15 MMD^2 score: 0.013629291631276352 for bandwidth: 0.2 MMD^2 score: 0.014214256636971845 for bandwidth: 0.25 MMD^2 score: 0.015443687141364015 for bandwidth: 0.3 MMD^2 score: 0.01853420968861652 for bandwidth: 0.35 MMD^2 score: 0.014411663277056563 for bandwidth: 0.4 MMD^2 score: 0.017518280459942748 --------Data_size: Medium ----------- for bandwidth: 0.01 MMD^2 score: 0.004301337898196311 for bandwidth: 0.05 MMD^2 score: 0.0023686525534216685 for bandwidth: 0.1 MMD^2 score: 0.003316046723267796 for bandwidth: 0.15 MMD^2 score: 0.0034443858511293834 for bandwidth: 0.2 MMD^2 score: 0.00376625869288183 for bandwidth: 0.25 MMD^2 score: 0.004871164915991488 for bandwidth: 0.3 MMD^2 score: 0.006049072647813292 for bandwidth: 0.35 MMD^2 score: 0.007820356559834263 for bandwidth: 0.4 MMD^2 score: 0.008314072709681763 --------Data_size: Large ----------- for bandwidth: 0.01 MMD^2 score: 0.0015842704000799124 for bandwidth: 0.05 MMD^2 score: 0.0024984858386801756 for bandwidth: 0.1 MMD^2 score: 0.0013548147448194658 for bandwidth: 0.15 MMD^2 score: 0.0017721989809129646 for bandwidth: 0.2 MMD^2 score: 0.0028602606360367266 for bandwidth: 0.25 MMD^2 score: 0.0034149266742496964 for bandwidth: 0.3 MMD^2 score: 0.004455007301260472 for bandwidth: 0.35 MMD^2 score: 0.005633445445235363 for bandwidth: 0.4 MMD^2 score: 0.007897413444463042
# Plot MMD^2 vs. bandwidth for the KDE (inverse-multi-quadratic kernel).
plot_dict_of_dicts(data=mmd_kde_model_inv_quad,
                   x_label="bandwidth (h) for KDE",
                   y_label="MMD_squared",
                   title="MMD^{squared} (with inverse_multi_quadratic_kernel) for KDE for dataset of different lengths ")
In the case of both GMMs with small L (number of mixtures) and KDE with otherwise not good bandwidth (>0.25 in our case), having large number of data-points is favourable!
We expected (and this can also be seen from the plots above) the single Gaussian to perform the worst (the L=1 case in the GMM), as our dataset is quite 'multimodal', so to speak, which a single Gaussian clearly cannot capture.
We would like to point out that in our code, we missed providing the calculation for $mmd^{2}$ in the histogram and single Gaussian fit.
We have calculated the $mmd^{2}$ caln (with different kernels) for both GMM and KDE.
Although our results for the GMM model are similar (in trend), we are not quite sure about the results for the KDE provided in the sample solution. What we observed was that for the KDE, a bandwidth around 0.1 was suitable, and as we increased (or decreased) it, the performance of the KDE worsened. This is something we also expect, as increasing the bandwidth results in oversmoothing, which is clearly reflected in our plots. The sample solution, by contrast, shows that increasing the bandwidth beyond 0.1 lowers the value of MMD_squared, something we don't quite expect.
General comment for task 1: Except for the histogram, the remaining three models were implemented and trained in a manner mostly similar to the sample solution. The sample solution is coded in a much more formal, object-oriented style, whereas we adopted a modular approach for the sake of flexibility. There are nonetheless some improvements, provided above in the comments, that would make our implementation faster and more robust.
import matplotlib.pyplot as plt
import numpy as np
from sklearn.datasets import load_digits
from sklearn.mixture import GaussianMixture
from sklearn.model_selection import train_test_split
Let's load the dataset and split in train and test set and analyze how well the digits are distributed:
# Load the 8x8 sklearn digits, make an 80/20 train/test split (no fixed
# random_state, so the split changes across runs), and plot the class
# frequencies of the training labels.
digits = load_digits()
X_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target, test_size=0.2)
# Count the frequency of each number from 0 to 9 in y_pred_gauss
count = np.bincount(y_train)
# Create a bar chart to visualize the frequencies
plt.bar(range(10), count)
plt.xticks(range(10))
# NOTE(review): these are training *labels*, not predictions — the axis
# label and title below are misleading.
plt.xlabel("Predicted Numbers")
plt.ylabel("Frequency")
plt.title("Frequency of Predicted Numbers")
plt.show()
We see that the digits are evenly distributed in the training dataset.
def MMD(x, y, kernel):
    """Empirical (biased) maximum mean discrepancy. The lower the result
    the more evidence that distributions are the same.

    Args:
        x: first sample, shape (n, d), drawn from distribution P
        y: second sample, shape (m, d), drawn from distribution Q
        kernel: kernel type such as "rbf" or "inverse-multi-quadratic"

    Raises:
        ValueError: for an unknown kernel name (the original silently
        returned the mean of zero matrices).

    Fix over the original: the pairwise squared-distance matrices are built
    with broadcasting so x and y may have *different* sample counts; the
    original `rx.T + ry` construction only worked when n == m.
    """
    # Gram matrices of inner products.
    xx, yy, zz = np.dot(x, x.T), np.dot(y, y.T), np.dot(x, y.T)
    x_sqnorms = np.diag(xx)  # ||x_i||^2
    y_sqnorms = np.diag(yy)  # ||y_j||^2
    # Squared Euclidean distance matrices via broadcasting.
    dxx = x_sqnorms[:, None] + x_sqnorms[None, :] - 2. * xx  # (n, n)
    dyy = y_sqnorms[:, None] + y_sqnorms[None, :] - 2. * yy  # (m, m)
    dxy = x_sqnorms[:, None] + y_sqnorms[None, :] - 2. * zz  # (n, m)
    # Sum the kernel over several bandwidths (a simple multi-scale MMD).
    bandwidth_range = [10, 15, 20, 25, 30]
    if kernel == "rbf":
        # Squared Exponential or Gaussian Kernel
        XX = sum(np.exp(-0.5 * dxx / a) for a in bandwidth_range)
        YY = sum(np.exp(-0.5 * dyy / a) for a in bandwidth_range)
        XY = sum(np.exp(-0.5 * dxy / a) for a in bandwidth_range)
    elif kernel == "inverse-multi-quadratic":
        # Inverse Multi-Quadratic Kernel
        XX = sum(1 / (dxx / a + 1) for a in bandwidth_range)
        YY = sum(1 / (dyy / a + 1) for a in bandwidth_range)
        XY = sum(1 / (dxy / a + 1) for a in bandwidth_range)
    else:
        raise ValueError(f"unknown kernel: {kernel!r}")
    # Biased estimator: mean over all pairs (equals the original
    # np.mean(XX + YY - 2*XY) when n == m).
    return np.mean(XX) + np.mean(YY) - 2. * np.mean(XY)
Readability and Modularity: The sample solution separates the kernel functions (squared_exponential_kernel and inverse_multi_quadratic_kernel) from the main MMD calculation function (mmd2). This makes the code more modular and potentially easier to read and maintain.
Performance: Our implementation (MMD function) uses vectorized operations and takes advantage of NumPy's capabilities for efficient array computations. This could lead to better performance, especially for large datasets, compared to the nested list comprehensions in the sample solution.
As a first generative network we try to implement the Density Forest implemtation from https://github.com/kfritsch/density_forest
# Attempt at the third-party density-forest baseline; the module is not
# available in this environment, so this cell raises ModuleNotFoundError
# (see the traceback below) and the approach is abandoned in the text.
from DensityForest import DensityForest
X = X_train
density_forest = DensityForest(n_estimators=20)
density_forest.fit(X)
--------------------------------------------------------------------------- ModuleNotFoundError Traceback (most recent call last) c:\Users\luke\OneDrive\Dokumente\UniHeidelberg\Master\Semester3\Generative Neural Networks\code\Assignment_1-Feedback\generative-baselines-commented.ipynb Cell 63 line 1 ----> <a href='vscode-notebook-cell:/c%3A/Users/luke/OneDrive/Dokumente/UniHeidelberg/Master/Semester3/Generative%20Neural%20Networks/code/Assignment_1-Feedback/generative-baselines-commented.ipynb#Y114sZmlsZQ%3D%3D?line=0'>1</a> from DensityForest import DensityForest <a href='vscode-notebook-cell:/c%3A/Users/luke/OneDrive/Dokumente/UniHeidelberg/Master/Semester3/Generative%20Neural%20Networks/code/Assignment_1-Feedback/generative-baselines-commented.ipynb#Y114sZmlsZQ%3D%3D?line=1'>2</a> X = X_train <a href='vscode-notebook-cell:/c%3A/Users/luke/OneDrive/Dokumente/UniHeidelberg/Master/Semester3/Generative%20Neural%20Networks/code/Assignment_1-Feedback/generative-baselines-commented.ipynb#Y114sZmlsZQ%3D%3D?line=2'>3</a> density_forest = DensityForest(n_estimators=20) ModuleNotFoundError: No module named 'DensityForest'
After a few hours of bug fixing in the code, since it had semantic and syntax errors, we are now at a point where the network starts training, but we still run into an additional error in the Node implementation. Since fixing the semantic error of the Node class and potential further errors, plus a separate implementation of the sampling method, would be beyond the scope of the first assignment, we decided to skip the density forest implementation and will continue with the single Gaussian.
Now we implement a single Gaussian model, by using the 'GaussianMixture' model from scikit-learn with just one component
# Fit a single multivariate Gaussian (a GMM with one component) to the
# training digits, score samples against the test set with MMD, and
# display ten generated samples as 8x8 grayscale images.
n_components = 1
single_gaussian = GaussianMixture(n_components=n_components)
single_gaussian.fit(X_train)
# Draw as many samples as there are test points so the MMD terms are balanced.
new_samples, _ = single_gaussian.sample(len(X_test))
result_rbf = MMD(X_test,new_samples, "rbf")
result_inv = MMD(X_test,new_samples, "inverse-multi-quadratic")
print(f"MMD with squared exponential kernel: {result_rbf.item()}")
print(f"MMD with inverse multi quadratic kernel: {result_inv.item()}")
# Fresh batch of 10 samples for visual inspection.
new_samples, _ = single_gaussian.sample(10)
fig, axes = plt.subplots(2, 5, figsize=(10, 5))
for i, ax in enumerate(axes.ravel()):
    ax.imshow(new_samples[i].reshape(8, 8), cmap=plt.cm.gray)
    ax.set_title(f"Sample {i}")
plt.show()
MMD with squared exponential kernel: 0.027877016283483855 MMD with inverse multi quadratic kernel: 0.03021700764199574
We can see that the single Gaussian is already capable of generating number-like images, with some samples indicating legible numbers despite higher noise. However, it clearly still lacks the capability to generate decent-quality images in the majority of samples. Next, we will analyze the quality of the generated samples depending on the dataset size.
# MMD of the single-Gaussian model as a function of training-set size.
# Create a list of dataset sizes you want to use
dataset_sizes = [0.1, 0.3, 0.5, 0.8, 1.0] # Fraction of the original dataset size
rbf_gaussian = []
inv_gaussian = []
var_data_gaussian = GaussianMixture(n_components=1)
for size in dataset_sizes:
    # Split the dataset into a training sets with the desired size
    if size == 1.0:
        # NOTE(review): test_size=None makes train_test_split fall back to its
        # default test fraction (0.25), so "1.0" actually trains on ~75% of
        # X_train, not all of it — verify intent.
        X, _, y, _ = train_test_split(X_train, y_train, test_size=None)
    else:
        X, _, y, _ = train_test_split(X_train, y_train, test_size=1 - size)
    # Train the classifier on the current dataset size
    var_data_gaussian.fit(X)
    new_samples, _ = var_data_gaussian.sample(len(X))
    rbf_gaussian.append(MMD(X,new_samples, "rbf"))
    inv_gaussian.append(MMD(X,new_samples, "inverse-multi-quadratic"))
# Create the plot
plt.figure(figsize=(10, 6))
plt.plot(dataset_sizes, rbf_gaussian, label="RBF MMD")
plt.plot(dataset_sizes, inv_gaussian, label="Inv. Multi-Quadratic MMD")
plt.title("MMD vs. Dataset Size")
plt.xlabel("dataset size")
plt.ylabel("MMD")
plt.legend()
plt.grid(True)
c:\Users\luke\anaconda3\envs\ml_homework\Lib\site-packages\sklearn\cluster\_kmeans.py:1436: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1. warnings.warn( c:\Users\luke\anaconda3\envs\ml_homework\Lib\site-packages\sklearn\cluster\_kmeans.py:1436: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=2. warnings.warn( c:\Users\luke\anaconda3\envs\ml_homework\Lib\site-packages\sklearn\cluster\_kmeans.py:1436: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=3. warnings.warn(
We see, the model improves the more training data it uses
Let's continue with training a Gaussian mixture model (GMM), starting with 55 Gaussian components
# Fit a 55-component GMM to the training digits, score against the test set
# with MMD, and display ten generated samples.
n_components = 55
gmm = GaussianMixture(n_components=n_components)
gmm.fit(X_train)
new_samples, _ = gmm.sample(len(X_test))
result_rbf = MMD(X_test,new_samples, "rbf")
result_inv = MMD(X_test,new_samples, "inverse-multi-quadratic")
print(f"MMD with squared exponential kernel: {result_rbf.item()}")
print(f"MMD with inverse multi quadratic kernel: {result_inv.item()}")
# plot generated images
new_samples, _ = gmm.sample(10)
fig, axes = plt.subplots(2, 5, figsize=(10, 5))
for i, ax in enumerate(axes.ravel()):
    ax.imshow(new_samples[i].reshape(8, 8), cmap=plt.cm.gray)
    ax.set_title(f"Sample {i}")
plt.show()
MMD with squared exponential kernel: 0.0278568518776322 MMD with inverse multi quadratic kernel: 0.028092643664431385
Now we investigate the hyperparameter number of components (n_components) in a grid-search attempt to find a good fit
# Grid search over the number of mixture components (5, 10, ..., 100),
# scoring each fitted GMM by MMD against the held-out test set.
n_components = list(range(5, 101, 5))
rbf_mmd = []
inv_mmd = []
for _, n in enumerate(n_components):
    n_gmm = GaussianMixture(n_components=n)
    n_gmm.fit(X_train)
    new_samples, _ = n_gmm.sample(len(X_test))
    rbf_mmd.append(MMD(X_test,new_samples, "rbf"))
    inv_mmd.append(MMD(X_test,new_samples, "inverse-multi-quadratic"))
# Create the plot
plt.figure(figsize=(10, 6))
plt.plot(n_components, rbf_mmd, label="RBF MMD")
plt.plot(n_components, inv_mmd, label="Inv. Multi-Quadratic MMD")
plt.title("MMD vs. n_components")
plt.xlabel("n_components")
plt.ylabel("MMD")
plt.legend()
plt.grid(True)
plt.show()
Further, we test the influence of the dataset size with the optimal n_components value
# MMD of the 55-component GMM as a function of training-set size.
# Create a list of dataset sizes you want to use
dataset_sizes = [0.1, 0.3, 0.5, 0.8, 1.0] # Fraction of the original dataset size
rbf_gmm = []
inv_gmm = []
var_data_gmm = GaussianMixture(n_components=55)
for size in dataset_sizes:
    # Split the dataset into a training sets with the desired size
    if size == 1.0:
        # NOTE(review): test_size=None defaults to 0.25, so this branch uses
        # only ~75% of X_train — verify intent.
        X, _, y, _ = train_test_split(X_train, y_train, test_size=None)
    else:
        X, _, y, _ = train_test_split(X_train, y_train, test_size=1 - size)
    # Train the classifier on the current dataset size
    var_data_gmm.fit(X)
    new_samples, _ = var_data_gmm.sample(len(X))
    rbf_gmm.append(MMD(X,new_samples, "rbf"))
    inv_gmm.append(MMD(X,new_samples, "inverse-multi-quadratic"))
# Create the plot
plt.figure(figsize=(10, 6))
plt.plot(dataset_sizes, rbf_gmm, label="RBF MMD")
plt.plot(dataset_sizes, inv_gmm, label="Inv. Multi-Quadratic MMD")
plt.title("MMD vs. Dataset Size")
plt.xlabel("dataset size")
plt.ylabel("MMD")
plt.legend()
plt.grid(True)
Also here the model improves with dataset size
Now we implement a kernel density estimator (KDE) with squared exponential kernel
# Kernel density estimator with a Gaussian (squared-exponential) kernel on
# the digits: fit, score samples with MMD, and display ten samples.
from sklearn.neighbors import KernelDensity
bandwidth = 1.1
kde = KernelDensity(bandwidth=bandwidth, kernel='gaussian')
kde.fit(X_train)
new_samples = kde.sample(len(X_test))
result_rbf = MMD(X_test,new_samples, "rbf")
result_inv = MMD(X_test,new_samples, "inverse-multi-quadratic")
print(f"MMD with squared exponential kernel: {result_rbf.item()}")
print(f"MMD with inverse multi quadratic kernel: {result_inv.item()}")
new_samples = kde.sample(10)
fig, axes = plt.subplots(2, 5, figsize=(10, 5))
for i, ax in enumerate(axes.ravel()):
    ax.imshow(new_samples[i].reshape(8, 8), cmap=plt.cm.gray)
    ax.set_title(f"Sample {i}")
plt.show()
MMD with squared exponential kernel: 0.02797107095274239 MMD with inverse multi quadratic kernel: 0.02836541329836265
# Sweep the KDE bandwidth from 0.1 to 2.5 and score each fit by MMD against
# the held-out test set.
bandwidths = np.arange(0.1, 2.6, 0.1)
rbf_mmd = []
inv_mmd = []
for _, bandwidth in enumerate(bandwidths):
    n_kde = KernelDensity(bandwidth=bandwidth, kernel='gaussian')
    n_kde.fit(X_train)
    new_samples = n_kde.sample(len(X_test))
    rbf_mmd.append(MMD(X_test,new_samples, "rbf"))
    inv_mmd.append(MMD(X_test,new_samples, "inverse-multi-quadratic"))
# Create the plot
plt.figure(figsize=(10, 6))
plt.plot(bandwidths, rbf_mmd, label="RBF MMD")
plt.plot(bandwidths, inv_mmd, label="Inv. Multi-Quadratic MMD")
plt.title("MMD vs. bandwidth")
plt.xlabel("bandwith")
plt.ylabel("MMD")
plt.legend()
plt.grid(True)
plt.show()
# MMD of the KDE (bandwidth 1.1) as a function of training-set size.
# Create a list of dataset sizes you want to use
dataset_sizes = [0.1, 0.3, 0.5, 0.8, 1.0] # Fraction of the original dataset size
rbf_kde = []
inv_kde = []
var_data_gaussian = KernelDensity(bandwidth=1.1, kernel='gaussian')
for size in dataset_sizes:
    # Split the dataset into a training sets with the desired size
    if size == 1.0:
        # NOTE(review): test_size=None defaults to 0.25, so this branch uses
        # only ~75% of X_train — verify intent.
        X, _, y, _ = train_test_split(X_train, y_train, test_size=None)
    else:
        X, _, y, _ = train_test_split(X_train, y_train, test_size=1 - size)
    # Train the classifier on the current dataset size
    var_data_gaussian.fit(X)
    new_samples = var_data_gaussian.sample(len(X))
    rbf_kde.append(MMD(X,new_samples, "rbf"))
    inv_kde.append(MMD(X,new_samples, "inverse-multi-quadratic"))
# Create the plot
plt.figure(figsize=(10, 6))
plt.plot(dataset_sizes, rbf_kde, label="RBF MMD")
plt.plot(dataset_sizes, inv_kde, label="Inv. Multi-Quadratic MMD")
plt.title("MMD vs. Dataset Size")
plt.xlabel("dataset size")
plt.ylabel("MMD")
plt.legend()
plt.grid(True)
Finally, we implement a random forest classifier on the original dataset to distinguish the 10 digits. Further, we will use this classifier to check whether the models are working reasonably and whether the 10 digits are generated in equal proportion.
from sklearn.ensemble import RandomForestClassifier

# Fit a 100-tree random forest on the original digits; it later serves as a
# probe for how evenly each generative model covers the ten classes.
# RandomForestClassifier.fit returns the estimator itself, so chaining here
# leaves rf_classifier bound to the fitted model exactly as before.
rf_classifier = RandomForestClassifier(n_estimators=100).fit(X_train, y_train)
RandomForestClassifier()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
RandomForestClassifier()
from sklearn.metrics import accuracy_score

# Score the forest on the held-out digits and report the accuracy.
y_pred = rf_classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy on the test data: {accuracy:.2f}")

# Show the first ten test digits alongside true and predicted labels.
fig, axes = plt.subplots(2, 5, figsize=(10, 5))
for idx, axis in enumerate(axes.ravel()):
    axis.imshow(X_test[idx].reshape(8, 8), cmap=plt.cm.gray)
    axis.set_title(f"True: {y_test[idx]} Pred: {y_pred[idx]}")
plt.show()
# Sanity check on the training set: the predicted class frequencies should
# be roughly uniform across the ten digits.
y_pred = rf_classifier.predict(X_train)
# Count the frequency of each number from 0 to 9 in y_pred.
# minlength=10 guarantees one count per digit even if some class is never
# predicted; otherwise np.bincount could return fewer than 10 entries and
# plt.bar(range(10), count) would raise a shape-mismatch error.
count = np.bincount(y_pred, minlength=10)
# Create a bar chart to visualize the frequencies
plt.bar(range(10), count)
plt.xticks(range(10))
plt.xlabel("Predicted Numbers")
plt.ylabel("Frequency")
plt.title("Frequency of Predicted Numbers in the Train Dataset")
plt.show()
Accuracy on the test data: 0.99
We can see that the classifier does a really good job of classifying the test dataset. Also, the frequency of each predicted class in the training dataset is very evenly spread out. So now, let's test the classifier on generated data samples from each model!
# Draw 500 samples from each fitted generative model.
sample_size = 500
samples_gauss, _ = single_gaussian.sample(sample_size)
samples_gmm, _ = gmm.sample(sample_size)
samples_kde = kde.sample(sample_size)  # KernelDensity.sample returns only the samples

# Classify the single-Gaussian samples and inspect the first ten of them.
y_pred_gauss = rf_classifier.predict(samples_gauss)
fig, axes = plt.subplots(2, 5, figsize=(10, 5))
for i, ax in enumerate(axes.ravel()):
    ax.imshow(samples_gauss[i].reshape(8, 8), cmap=plt.cm.gray)
    ax.set_title(f"Pred: {y_pred_gauss[i]}")
fig.suptitle("Predictions for Single Gaussian Samples", fontsize=16)
plt.show()

# Count the frequency of each number from 0 to 9 in y_pred_gauss.
# minlength=10 keeps the bar chart valid even when some digit is never
# predicted (likely for generated samples); without it np.bincount can
# return fewer than 10 entries and break plt.bar(range(10), ...).
count = np.bincount(y_pred_gauss, minlength=10)
# Create a bar chart to visualize the frequencies
plt.bar(range(10), count)
plt.xticks(range(10))
plt.xlabel("Predicted Numbers")
plt.ylabel("Frequency")
plt.title("Frequency of Predicted Numbers")
plt.show()
# Classify the GMM samples and inspect the first ten of them.
y_pred_gmm = rf_classifier.predict(samples_gmm)
fig, axes = plt.subplots(2, 5, figsize=(10, 5))
for i, ax in enumerate(axes.ravel()):
    ax.imshow(samples_gmm[i].reshape(8, 8), cmap=plt.cm.gray)
    ax.set_title(f"Pred: {y_pred_gmm[i]}")
fig.suptitle("Predictions for GMM Samples", fontsize=16)
plt.show()

# Count the frequency of each number from 0 to 9 in y_pred_gmm.
# minlength=10 guarantees a count for every digit even if some class is
# never predicted, keeping plt.bar(range(10), count) well-formed.
count = np.bincount(y_pred_gmm, minlength=10)
# Create a bar chart to visualize the frequencies
plt.bar(range(10), count)
plt.xticks(range(10))
plt.xlabel("Predicted Numbers")
plt.ylabel("Frequency")
plt.title("Frequency of Predicted Numbers")
plt.show()
# Classify the KDE samples and inspect the first ten of them.
y_pred_kde = rf_classifier.predict(samples_kde)
fig, axes = plt.subplots(2, 5, figsize=(10, 5))
for i, ax in enumerate(axes.ravel()):
    ax.imshow(samples_kde[i].reshape(8, 8), cmap=plt.cm.gray)
    ax.set_title(f"Pred: {y_pred_kde[i]}")
# Fixed copy-paste bug: the title previously said "GMM Samples" here.
fig.suptitle("Predictions for KDE Samples", fontsize=16)
plt.show()

# Count the frequency of each number from 0 to 9 in y_pred_kde.
# minlength=10 guarantees a count for every digit even if some class is
# never predicted, keeping plt.bar(range(10), count) well-formed.
count = np.bincount(y_pred_kde, minlength=10)
# Create a bar chart to visualize the frequencies
plt.bar(range(10), count)
plt.xticks(range(10))
plt.xlabel("Predicted Numbers")
plt.ylabel("Frequency")
plt.title("Frequency of Predicted Numbers")
plt.show()
From the plot we can see that the single Gaussian performs the worst on the training dataset, since a disproportionate amount of the generated images get classified as the digit 8. The GMM and the KDE both have substantially better results, where the generated images are more evenly classified across the classes, suggesting better generated data samples.
General comment for task 2: The task was similarly solved with similar performance, only missing the Density Tree implementation. The sample solution is more readable and significantly reduces code length by using for loops over all models. However, the sample solution is able to achieve this because it does not implement the testing of different dataset sizes, bandwidths, and n_component values.